In [1]:
#Import all the necessary libraries

import pandas as pd # Used for data wrangling

import plotly.express as px
import matplotlib.pyplot as plt # Used for data visualization

from statsmodels.tsa.ar_model import AutoReg # The model to predict timeseries

from sklearn.metrics import mean_absolute_error, r2_score # Evaluation function

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf #Auto correlation visualization.

import plotly.io as pio 
pio.renderers.default='notebook' # Used to render plotly viz to stay on jupyter.


import warnings
warnings.filterwarnings('ignore') #Filter warnings
In [2]:
# Read the raw temperature data; must contain a "datetime" column and a "New York" column.
temperature_df = pd.read_csv("data/temperature.csv") # read the temperature data
In [3]:
# Keep only the columns this analysis needs: the timestamp and the
# New York temperature readings.
df = temperature_df.loc[:, ["datetime", "New York"]]
In [4]:
# Give the city column a generic name and parse the timestamp strings
# into proper datetime objects.
df = df.rename(columns={"New York": "Temperature"})
df = df.assign(datetime=pd.to_datetime(df["datetime"]))
In [5]:
# Promote the timestamp column to the index, then collapse the
# single-column frame down to a plain Series.
indexed = df.set_index("datetime")
df = indexed.squeeze()
In [6]:
# The raw timestamps are naive UTC; re-express them in local New York time.
utc_index = df.index.tz_localize("UTC")
df.index = utc_index.tz_convert("America/New_York")
In [7]:
# Resample to a regular 1-hour frequency (mean of readings in each hour).
# Lowercase "h" is used because the uppercase "H" alias is deprecated in
# pandas >= 2.2; both spellings resolve to the same hourly offset.
df = df.resample("h").mean()
In [8]:
# Preview the first hourly readings; note the leading NaN in the first bin.
df.head()
Out[8]:
datetime
2012-10-01 08:00:00-04:00           NaN
2012-10-01 09:00:00-04:00    288.220000
2012-10-01 10:00:00-04:00    288.247676
2012-10-01 11:00:00-04:00    288.326940
2012-10-01 12:00:00-04:00    288.406203
Freq: H, Name: Temperature, dtype: float64
In [9]:
# Check for missing values
# NOTE(review): the gaps presumably come from hours with no raw reading
# after the hourly resample -- verify against the raw file.
df.isna().sum()
Out[9]:
793

We have 793 records that have missing values.

Exploratory Data Analysis¶

In [10]:
# Summary statistics of the hourly series (readings appear to be in Kelvin,
# ranging roughly 250-310).
df.describe()
Out[10]:
count    44460.000000
mean       285.400406
std         10.220932
min        250.774000
25%        277.370000
50%        285.870000
75%        293.760000
max        310.240000
Name: Temperature, dtype: float64

We can see from the above summary that the minimum temperature is approximately 250 K and the maximum approximately 310 K (the readings are in Kelvin).

Distribution of the Temperature Readings¶

In [11]:
# Histogram of all hourly readings to inspect the overall distribution.
px.histogram(
    df,
    nbins=50,
    title="Distribution of the Temperature Readings",
).show()

We can see from the above histogram that the temperature of New York is approximately normally distributed over the years.

Temperature Readings In One Hour¶

In [12]:
# Raw hourly series across the whole time span.
hourly_fig = px.line(
    df,
    title="Readings of the Temperature in the Window of 1HR over the Years.",
)
hourly_fig.show()

There is a clear seasonal pattern in the New York temperature: the hourly readings over the years oscillate in the form of a sine wave.

Readings of the Temperature in the Window of 24HR (i.e. one day)¶

In [13]:
# Average the hourly readings into daily means before plotting
# (the old name "mask" was misleading -- this is a resampled series).
daily_mean = df.resample("D").mean()
fig = px.line(
    daily_mean,
    title="Readings of the Temperature in the Window of 24HR over the Years.",
    hover_name=daily_mean.index,
)
fig.show()

We can see from the above plot that the day with the lowest average temperature was the 20th of February 2015, with a value of 258.59 K, or approximately −14.56 °C.

We can also see that the day with the highest average temperature was the 20th of July 2013, with a Kelvin value of 304.77, equivalent to 31.62 °C, which makes it the third-hottest temperature in the history of New York.

In [14]:
# Monthly mean temperature over the full period.
# NOTE(review): the "M" (month-end) alias is deprecated in pandas >= 2.2 in
# favour of "ME" -- confirm the installed pandas version before changing it.
fig = px.line(
    df.resample("M").mean(),
    title="Readings of the Temperature in the Window of 1 Month over the Years."
)

fig.show()

We can see from the above summary that February 2015 was the month with the lowest average temperature over the years.

Also July 2013 was the month that had the highest temperature over years on average.

In [15]:
# Quarterly mean temperature over the full period.
# NOTE(review): "Q" (quarter-end) is deprecated in pandas >= 2.2 in favour
# of "QE" -- confirm the pandas version before changing it.
fig = px.line(
    df.resample("Q").mean(),
    # Fixed the "Quater" typo in the displayed chart title.
    title="Readings of the Temperature in the Window of Quarter over the Years."
)

fig.show()

The first quarter of 2015 shows the deepest dip in average temperature over the years in New York.

In [16]:
# Annual (year-end, "A") mean temperature, shown as the cell output.
# NOTE(review): "A" is deprecated in pandas >= 2.2 in favour of "YE" -- verify before upgrading.
df.resample("A").mean()
Out[16]:
datetime
2012-12-31 00:00:00-05:00    281.453499
2013-12-31 00:00:00-05:00    285.342590
2014-12-31 00:00:00-05:00    284.515356
2015-12-31 00:00:00-05:00    284.949812
2016-12-31 00:00:00-05:00    286.055159
2017-12-31 00:00:00-05:00    287.504149
Freq: A-DEC, Name: Temperature, dtype: float64
In [17]:
# Plot the annual means to make the year-on-year trend visible.
# NOTE(review): "A" is deprecated in pandas >= 2.2 in favour of "YE" -- verify before upgrading.
fig = px.line(
    df.resample("A").mean(),
    title="Readings of the Temperature at the End of Each Year."
)

fig.show()

We can see an upward trend in the temperature values over the years. This could be a result of global warming.

Modelling¶

Filling with backward fill¶

In [18]:
# Backward-fill: the leading NaN (and any internal gap) takes the next
# observed value. fillna(method="bfill") is deprecated in modern pandas;
# the dedicated .bfill() method is the supported equivalent.
df = df.bfill()

Filling also with forward fill¶

In [19]:
# Forward-fill any gap at the tail that backward-fill could not reach.
# fillna(method="ffill") is deprecated in modern pandas; .ffill() is the
# supported equivalent.
df = df.ffill()
In [20]:
# Confirm the leading NaN at 08:00 is now filled (it carries the 09:00 value).
df.head()
Out[20]:
datetime
2012-10-01 08:00:00-04:00    288.220000
2012-10-01 09:00:00-04:00    288.220000
2012-10-01 10:00:00-04:00    288.247676
2012-10-01 11:00:00-04:00    288.326940
2012-10-01 12:00:00-04:00    288.406203
Freq: H, Name: Temperature, dtype: float64

Plotting the Auto Correlation Function.¶

In [21]:
# Autocorrelation of the hourly series: how strongly each lag correlates
# with the present value.
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(df, ax=ax)
plt.xlabel("Lag [hours]")  # fixed stray backtick in the rendered label text
plt.ylabel("Correlation Coefficient");

Plotting the Partial Autocorrelation Function¶

In [22]:
# Partial autocorrelation: the direct effect of each lag with the
# intermediate lags controlled for ("ywm" = Yule-Walker, bias-adjusted).
fig, ax = plt.subplots(figsize=(15, 6))
plot_pacf(df, ax=ax, method="ywm")
ax.set_xlabel("Lag [hours]")
ax.set_ylabel("Correlation Coefficient");

Splitting the Dataset¶

In [23]:
# Chronological 90/10 split -- a time series must never be shuffled.
split_at = int(len(df) * 0.9)
y_train, y_test = df.iloc[:split_at], df.iloc[split_at:]

Baseline (Creating a Dummy Prediction)¶

In [24]:
# Naive baseline: predict the training mean everywhere. Any real model
# must beat this MAE to be worth keeping.
y_train_mean = y_train.mean()
y_pred_baseline = [y_train_mean] * len(y_train)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
# "Mean P2 Reading" was a copy-paste leftover from an air-quality notebook;
# this series holds temperatures.
print("Mean Temperature Reading:", round(y_train_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))
Mean P2 Reading: 284.59
Baseline MAE: 8.57

We used the mean of the entire training series as a dummy prediction and measured its mean absolute error, which came to 8.57. Therefore, any model we build must achieve an MAE lower than that.

Model (Evaluation and HyperParameter Tuning)¶

In [25]:
# Search the AutoReg lag order on in-sample MAE. fit().predict() yields
# fitted values from index `lag` onward, so compare against y_train[lag:].
mae_by_lag = {}
for lag in range(1, 45):
    fitted = AutoReg(y_train, lags=lag).fit()
    in_sample = fitted.predict().dropna()
    mae_by_lag[lag] = mean_absolute_error(y_train[lag:], in_sample)
mae_series = pd.Series(mae_by_lag, name="mae")

# Show the best-performing lag orders first.
mae_series.sort_values().head()
Out[25]:
43    0.539295
44    0.539317
42    0.539919
41    0.540170
40    0.540445
Name: mae, dtype: float64

We can see that a lag order of 43 lets our model predict the time series most accurately, as its mean absolute error was the lowest.

Building the Model Last Phase Using Walk Forward Validation¶

In [26]:
# Walk-forward validation: refit on everything seen so far, forecast one
# step ahead, then reveal the true value before the next refit.
# Series.append was removed in pandas 2.0, so forecasts are collected in a
# list and concatenated once; `history` grows via pd.concat.
forecasts = []
history = y_train.copy()
for _ in range(len(y_test)):
    model_wfv = AutoReg(history, lags=43).fit()
    next_pred = model_wfv.forecast()
    forecasts.append(next_pred)
    history = pd.concat([history, y_test[next_pred.index]])
y_pred_wfv = pd.concat(forecasts)
y_pred_wfv.head()
Out[26]:
2017-05-25 07:00:00-04:00    286.339709
2017-05-25 08:00:00-04:00    286.645965
2017-05-25 09:00:00-04:00    286.813751
2017-05-25 10:00:00-04:00    287.104280
2017-05-25 11:00:00-04:00    287.217840
Freq: H, dtype: float64
In [27]:
# Overlay the true test series and the walk-forward predictions.
result = {"y_test": y_test, "y_pred_wfv": y_pred_wfv}
df_pred_test = pd.DataFrame(result)
fig = px.line(df_pred_test)
fig.update_layout(
    title="Weather Forecasting, WFV Predictions",
    xaxis_title="Date",
    # "PM2.5 Level" was a copy-paste leftover; the series holds Kelvin temperatures.
    yaxis_title="Temperature (K)",
)

Evaluate the Performance of the Last Model¶

In [28]:
# Mean absolute error of the walk-forward predictions on the held-out test set.
mean_absolute_error(y_test, y_pred_wfv)
Out[28]:
0.36939025645673274
In [29]:
# R^2 (coefficient of determination) of the walk-forward predictions.
r2_score(y_test, y_pred_wfv)
Out[29]:
0.9907992326314755

We can see that our model performed well, achieving an R² score of approximately 0.99 on the test set.